# Not Built-In Libraries
# Notebook-only dependency bootstrap: import the optional visualization /
# acceleration packages, and pip-install them on first failure.
# NOTE: the `!pip` lines are IPython shell magics — this cell only runs
# inside a Jupyter/IPython kernel, not as a plain Python script.
try:
    from yellowbrick.target import ClassBalance
    # from sklearnex import patch_sklearn
    # patch_sklearn()
except ModuleNotFoundError:
    !pip install yellowbrick
    !pip install scikit-learn-intelex
    from yellowbrick.target import ClassBalance
    from sklearnex import patch_sklearn
    # Monkey-patch scikit-learn with the Intel extension for faster CPU runs.
    patch_sklearn()
The following environment setting is needed just for Windows users.
import os
# Just for Windows users: cap OpenMP threads to avoid thread
# oversubscription with MKL/OpenMP-backed numeric libraries.
os.environ["OMP_NUM_THREADS"] = '4'
import numpy as np
import pickle
import random
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import seaborn as sns
import pandas as pd
import warnings
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score
from visutils import *
from visutils import plot_dense_keypoints
from utils import create_dense_kp
import optuna
from sklearn import metrics
import plotly
from utils import ClassifierBoVW, run_model_with_params, randint, cross_validate
from dataset import Dataset
from IPython.display import clear_output
PLAY_MODE = 0 # if 1, use less samples than the total amount. For playing around and trying things.
RANDOM_SEED = 42 # seed for reproducible subsampling / experiments
TEST_N_SAMPLES = 120 # per-split sample count kept when PLAY_MODE is on
IMSIZE = 128 # we slightly downsample the images to compensate for our lack of compute power.
First, we will show a brief visualization summarizing the categories (labels) of the dataset and its distribution.
# Load the pickled train/test file lists and labels, rewriting the stored
# absolute path prefix so the files resolve relative to this notebook.
train_images_filenames = pickle.load(open('../MIT_split/train_images_filenames.dat', 'rb'))
test_images_filenames = pickle.load(open('../MIT_split/test_images_filenames.dat', 'rb'))
train_images_filenames = ['..' + n[15:] for n in train_images_filenames]
test_images_filenames = ['..' + n[15:] for n in test_images_filenames]
train_labels = pickle.load(open('../MIT_split/train_labels.dat', 'rb'))
test_labels = pickle.load(open('../MIT_split/test_labels.dat', 'rb'))
if PLAY_MODE:
    # Keep a small random subset of each split for quick experimentation.
    # BUGFIX: seed with RANDOM_SEED (it was defined but never used, so the
    # subset changed on every run) and drop the redundant [:TEST_N_SAMPLES]
    # slice — np.random.choice with replace=False already returns exactly
    # TEST_N_SAMPLES indices.
    np.random.seed(RANDOM_SEED)
    idxs_train = np.random.choice(len(train_labels), TEST_N_SAMPLES, replace=False)
    idxs_test = np.random.choice(len(test_labels), TEST_N_SAMPLES, replace=False)
    train_images_filenames = np.array(train_images_filenames)[idxs_train].tolist()
    test_images_filenames = np.array(test_images_filenames)[idxs_test].tolist()
    train_labels = np.array(train_labels)[idxs_train].tolist()
    test_labels = np.array(test_labels)[idxs_test].tolist()
train_dataset = Dataset(train_images_filenames, train_labels, (IMSIZE, IMSIZE))
test_dataset = Dataset(test_images_filenames, test_labels, (IMSIZE, IMSIZE))
def _get_data_p(labels_list, labels):
return [labels_list.count(x) for x in labels]
def _get_data_relative(labels_list, labels):
return np.array([labels_list.count(x)/len(labels_list) * 100 for x in labels])
def _get_nested_data(train, test, labels):
    """Pair per-label train and test counts into an (n_labels, 2) array."""
    train_counts = _get_data_p(train, labels)
    test_counts = _get_data_p(test, labels)
    return np.array(list(zip(train_counts, test_counts)))
# --- Class-balance figure: 2 x 4 grid; first row holds bar charts. ---
fig, main_axs = plt.subplots(nrows = 2, ncols=4, figsize = (32, 16))
mako_cmap = (sns.color_palette("mako", as_cmap=True))
sns.set(palette = 'mako')
# Unique label set over both splits (order is arbitrary because of set()).
labels = list(set(test_labels+train_labels))
#### BAR CHART USING YELLOWBRICK ####
axes = main_axs[0]
# Common y-range so the three count-based charts are directly comparable.
axes[2].set_ylim([0, 300])
axes[1].set_ylim([0, 300])
axes[0].set_ylim([0, 300])
for i in range(4):
    axes[i].set_xticks(range(len(labels)), labels=labels)
    axes[i].set_ylabel('instances count')
    plt.setp(axes[i].get_xticklabels(), fontsize=10, rotation=30)
axes[3].set_ylabel('proportion difference (%)')
# Per-split class balance drawn by yellowbrick onto the prepared axes.
visualizer2 = ClassBalance(labels=labels, ax = axes[1])
visualizer2.fit(np.array(test_labels)) # Fit the data to the visualizer
axes[1].set_title( 'test class balance' )
visualizer3 = ClassBalance(labels=labels, ax = axes[0])
visualizer3.fit(np.array(train_labels)) # Fit the data to the visualizer
axes[0].set_title( 'train class balance' )
# Passing both splits draws the side-by-side train/test comparison.
visualizer4 = ClassBalance(labels=labels, ax = axes[2])
visualizer4.fit(np.array(train_labels), np.array(test_labels))
axes[2].legend(['train', 'test'])
axes[2].set_title( 'balance comparison' )
# Signed difference of per-class percentages between train and test.
raw_unbalance = _get_data_relative(train_labels, labels) - _get_data_relative(test_labels, labels)
sns.barplot(x = labels, y = raw_unbalance, ax = axes[3])
axes[3].set_title( 'class relative imbalance' )
### PIE CHART USING SNS ###
# Second row: pies for train, test, nested train/test, and imbalance shares.
train = _get_data_p(train_labels, labels)
main_axs[1][0].pie(train, labels = labels, autopct='%.0f%%', startangle = 90, textprops={'color':"w"})
test = _get_data_p(test_labels, labels)
main_axs[1][1].pie(test, labels = labels, autopct='%.0f%%', startangle = 90, textprops={'color':"w"})
size = 0.1 # relative width of the inner donut ring
both = _get_nested_data(train_labels, test_labels, labels)
cmap = plt.colormaps["tab20c"]
# Alternate two tab20c shades (indices 15/16) for train/test inner segments.
inner = cmap([i%2+15 for i in range(len(both.flatten()))])
# Outer ring: per-class totals; inner ring: train/test breakdown per class.
main_axs[1][2].pie(both.sum(axis = 1), labels = labels, autopct='%.0f%%', startangle = 90, textprops={'color':"w"})
main_axs[1][2].pie(both.flatten(), radius = 1-size, startangle = 90, wedgeprops=dict(width=size, edgecolor='w'), colors = inner, textprops={'color':"w"})
test_split_path = mpatches.Patch(color=cmap(16), label='test')
train_split_path = mpatches.Patch(color=cmap(15), label='train')
main_axs[1][2].legend(handles = [test_split_path, train_split_path])
main_axs[1][3].set_title('contribution to train-test imbalance')
# Absolute per-class contribution to the train/test percentage gap.
main_axs[1][3].pie(np.abs(raw_unbalance), labels = labels, startangle = 90)
fig.suptitle('Dataset label distribution')
plt.savefig('../vis/dataset_distribution.png')
plt.show()
# Pie chart with one randomly chosen example image embedded per class wedge.
take_one = lambda y: random.choice([x for x in train_images_filenames if y in x])
examples = [take_one(label) for label in labels]
wedges, texts, _ = plt.pie(train, labels = labels, autopct='%.0f%%', startangle = 90, labeldistance=1.2, wedgeprops = { 'linewidth': 2, "edgecolor" :"k","fill":False, }, textprops={'color':"b"})
for n, wedge in enumerate(wedges):
    img_to_pie(examples[n], wedge, xy=(0, 0), zoom=1 )
    # BUGFIX: was `wedges[i]` — `i` is a stale index left over from an
    # earlier loop; use the current wedge index `n`.
    wedges[n].set_zorder(10)
plt.title('Dataset Labels Distribution')
plt.show()
Note that the dataset is almost perfectly balanced, resulting in around 200-300 images for training each of the 8 categories and ~100 for testing. The maximum unbalance is exhibited by images from the "coast" class, with only a 1.5% relative unbalance.
We denote the relative unbalance $r$ as the deviation with respect a perfect balance assuming a uniform distribution.
Expected number of images for a category (sampled from uniform distribution):
$E[c] = \frac{I_{dataset}}{N_{classes}}$
Where, therefore, the unbalance is:
$U(c) = E[c] - |c_{observed}|$
# Show sample images for each split (4 per row; see visutils.viz_dataset).
viz_dataset(train_images_filenames, train_labels, 4)
viz_dataset(test_images_filenames, test_labels, 4)
Last week we compared KAZE and SIFT descriptors, both in their vanilla and dense formulations. We found dense SIFT to be the clear winner, in terms of accuracy. Therefore this week we will stick to dense SIFT as our descriptor of choice. Nevertheless, last week we did not explore the effect that the step size (distance between keypoints), and the scale (how big are the keypoints) have on the peformance of the system. This week we investigate these two parameters.
This week several classifiers will be studied: besides the Support Vector Machine (SVM), we consider KNeighborsClassifier, LogisticRegression, RandomForestClassifier, and GaussianNB.
We also implement spatial pyramids, which allow to take into account the spatial layout of features in an image. In addition to the more common grid splitting, we also implement a horizontal splitting scheme, since the dataset contains mostly scenes from horizontal landscapes.
We include normalization of the features: L2, Power-Norm and Standardization.
Finally, as in previous weeks, we cross-validate the models, with accuracy as the metric of choice.
Additionally, we implement and test Fisher vectors, which allow us to obtain a more representative/expressive codebook, by accounting also for the variance of the clusters.
As we have seen in Section 1, the dataset is balanced. Therefore, vanilla accuracy is a robust metric for evaluating our models. Thus, we choose it to asses our models and to perform cross-validation.
As stated above, Dense SIFT is last week's strongest keypoint descriptor with a considerable margin. Thus we adopt it for this week.
Dense SIFT is implemented by computing SIFT descriptors in an equally-spaced grid of points on the image:
from visutils import plot_dense_keypoints
from utils import create_dense_kp
# Sample an image to plot
image = train_images_filenames[9]
# Create cv2 SIFT descriptor
# NOTE(review): cv2 is used here but no `import cv2` is visible in this
# chunk — presumably imported in an unseen cell or via utils; verify.
sift = cv2.SIFT_create()
# Dense SIFT: compute SIFT descriptors on a regular grid of keypoints.
img = cv2.imread(image)
gray_img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
dense_kp = create_dense_kp(gray_img.shape, step_size=10)
_,dense_sift_des = sift.compute(gray_img, dense_kp)
# Show dense keypoints with different steps size
steps = [10, 15, 20, 30, 40]
dense_kps = []
for step in steps:
    dense_kps.append(create_dense_kp(gray_img.shape, step_size=step))
plot_dense_keypoints(image, kps=dense_kps)
That is, Dense SIFT uses a grid of spatially equidistant keypoints.
Here we tune the step_size parameter, which corresponds to the horizontal and vertical distance between keypoints.
Notice however how dense SIFT is not invariant to scale. Vanilla SIFT is scale covariant, because it detects keypoints at different scales. For this reason, we try to emulate this behaviour in dense SIFT by introducing the n_sizes parameter. Thus, for each size (from 1 to n_sizes), we create half the keypoints as in the previous level, but doubling the keypoint's scale. This allows to introduce information at several scales.
We now explore the effect that the step size and the number of levels have on the model's performance.
First of all, we fix a n_sizes = 1 (so we don't take scale into account) and try several step_size values. We compute the accuracy with 7 fold cross-validation.
We are also employing codebook size = 256 , along with an SVM classifier (with RBF kernel). No normalization nor pyramids are applied.
# Dense-SIFT BoVW baseline for the step_size sweep (n_sizes=1 fixes a
# single keypoint scale; RBF SVM classifier, codebook of 256 words).
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense=True,
    step_size=10,
    n_sizes=1,
    codebook_size=256,
    distance_metric='l2',
    classifier_class=SVC,
    classifier_parameters = {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'},
)
params_step_size = list(range(1, 51, 4)) # up to 50
if not os.path.exists('./results/denseSIFTstepSize.csv'):
    # 7-fold CV per step_size value; cache the aggregated results as CSV.
    res_step_size = run_model_with_params(clf, 'step_size', params_step_size, train_dataset, verbose=True, n_jobs=4, cv=7,)
    clear_output()
    result_dict = []
    for i, res in enumerate(res_step_size):
        # res is (param_value, cv-results dict with 'fit_time'/'test_score')
        result_dict.append({'step_size': res[0], 'mean_fit_time': round(res[1]['fit_time'].mean(), 2), 'mean_test_score': round(res[1]['test_score'].mean(), 2),})
    df_sift = pd.DataFrame.from_dict(result_dict)
    df_sift.to_csv('./results/denseSIFTstepSize.csv')
else:
    df_sift = pd.read_csv('./results/denseSIFTstepSize.csv')
df_sift.head(100)
| Unnamed: 0 | step_size | mean_fit_time | mean_test_score | |
|---|---|---|---|---|
| 0 | 0 | 1 | 245.09 | 0.79 |
| 1 | 0 | 5 | 191.49 | 0.80 |
| 2 | 1 | 9 | 132.27 | 0.78 |
| 3 | 2 | 13 | 128.66 | 0.75 |
| 4 | 3 | 17 | 120.42 | 0.73 |
| 5 | 4 | 21 | 125.05 | 0.72 |
| 6 | 5 | 25 | 125.10 | 0.71 |
| 7 | 6 | 29 | 120.41 | 0.69 |
| 8 | 7 | 33 | 111.54 | 0.69 |
| 9 | 8 | 37 | 110.62 | 0.69 |
| 10 | 9 | 41 | 111.50 | 0.67 |
| 11 | 10 | 45 | 84.93 | 0.67 |
| 12 | 11 | 49 | 87.59 | 0.67 |
# Violin plot of CV accuracy per step_size value.
# NOTE(review): the meaning of the [5] argument depends on
# visutils.violin_plot (not visible here) — verify against its signature.
violin_plot(df_sift, ['step_size'], [5], ['step size'])
param_pca_perc not found in dataframe
The pattern is clear: the smaller the step size, the higher the accuracy.
Intuitively, this makes sense, as the smaller the step size, the more keypoints we have, and thus the smaller the keypoint scale is. This implies that we have more detailed information of the image. Despite this information being more local (as the scale is smaller), these more localized but denser features seem to yield better results.
Notice how the accuracy does not improve from step size 5 to 1, as the accuracy is 80% and 79%, respectively. This means that reducing the step size improves accuracy only up to a certain point, at step_size = 5, and then it stabilizes.
Notice also how as the step size is reduced, the computational cost increases. Thus, there is a trade-off between the training time and the accuracy of the model.
# Same baseline with step_size fixed at 4 to sweep n_sizes (number of
# keypoint scales used by dense SIFT).
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense=True,
    step_size=4,
    n_sizes=1,
    codebook_size=256,
    distance_metric='l2',
    classifier_class=SVC,
    classifier_parameters = {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'},
)
params_n_sizes = list(range(1, 10, 1))
if not os.path.exists('./results/denseSIFTnSizes.csv'):
    # 7-fold CV per n_sizes value; cache the aggregated results as CSV.
    res_n_sizes = run_model_with_params(clf, 'n_sizes', params_n_sizes, train_dataset, verbose=True, n_jobs=4, cv=7,)
    clear_output()
    result_dict = []
    for i, res in enumerate(res_n_sizes):
        result_dict.append({'n_sizes': res[0], 'mean_fit_time': round(res[1]['fit_time'].mean(), 2), 'mean_test_score': round(res[1]['test_score'].mean(), 2),})
    df_n_sizes = pd.DataFrame.from_dict(result_dict)
    df_n_sizes.to_csv('./results/denseSIFTnSizes.csv')
else:
    df_n_sizes = pd.read_csv('./results/denseSIFTnSizes.csv')
df_n_sizes.head(100)
| Unnamed: 0 | n_sizes | mean_fit_time | mean_test_score | |
|---|---|---|---|---|
| 0 | 0 | 1 | 174.68 | 0.80 |
| 1 | 1 | 2 | 244.56 | 0.80 |
| 2 | 2 | 3 | 276.25 | 0.81 |
| 3 | 3 | 4 | 325.07 | 0.80 |
| 4 | 4 | 5 | 358.64 | 0.80 |
| 5 | 5 | 6 | 328.46 | 0.81 |
| 6 | 6 | 7 | 368.50 | 0.81 |
| 7 | 7 | 8 | 360.88 | 0.81 |
| 8 | 8 | 9 | 435.54 | 0.80 |
We cannot observe a significant increase in accuracy when we increase or decrease the number of levels. This means that for dense SIFT, the scale is not that important, at least for the dataset at hand.
This is in line with the conclusion we have just drawn regarding the step size: more abundant and more localized keypoints produce better results. We can conclude that sacrificing the scale-covariance of vanilla SIFT when switching to dense SIFT is not a real problem. Thus, there is no reason to prefer an n_sizes different from 1.
It is always recommended to normalize data so that all features have the same scale. This is known to lead to more robust training and overall better results. In this case, we normalize the histograms of visual words using L2 norm, Power norm, or StandardScale (standardization):
The Power norm is implemented with a `FunctionTransformer` and our custom function; the L2 norm uses `Normalizer(norm='l2')`; and standardization uses `StandardScaler`, computing $\frac{x - \mu}{\sigma}$, where $\mu$ and $\sigma$ are the mean and standard deviation of the training set, respectively.
# Load the persisted Optuna study for the normalization-scheme search and
# plot accuracy against the chosen normalizer.
study = pickle.load(open('__optuna_study_norm.pkl','rb'))
optuna.visualization.plot_slice(study, params=["Normalizers",], target_name='Accuracy')
We can see that all normalization schemes have a similar effect on the accuracy.
The best one seems to be L2 norm, which is the most common normalization scheme, with a small advantage. However, we can see that the difference between the three is barely significant. This means that the normalization scheme is not a crucial factor in the performance of the system.
The Bag of Visual Words (BoVW) approach sucesfully aggregates local information into a single global vector. Nevertheless, it does not account for the spatial disposition of the features within the image. We can alleviate this limitation with Spatial Pyramids: we partition the image into increasingly finer subregions, and generate a histogram of visual words within each of these smaller chunks. This approach considers the spatial layout of the features and provides more information than the BoVW alone. This enhances the performance of the visual recognition system.
Grid-like splitting is a method of partitioning an image into multiple smaller regions, or sub-regions, in a grid-like fashion. For each level of the pyramid, we split the image into a grid of $n \times n$ sub-regions. For each sub-region, we compute the histogram of visual words. This is repeated for each level of the pyramid, and the histograms are concatenated to form a single vector.
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def plot_image_divisions(image, level=1, color = '#fff', linewidth = 2):
    """Show *level* copies of *image* side by side, overlaying a
    2^(i+1) x 2^(i+1) grid on the i-th copy to visualize grid-like
    spatial-pyramid splits.

    Parameters
    ----------
    image : image array (H x W x C), e.g. as returned by cv2.imread.
    level : number of pyramid levels (one subplot per level).
    color : edge color of the grid rectangles.
    linewidth : line width of the grid rectangles.
    """
    # BUGFIX: squeeze=False keeps axs a 2D array so .ravel() also works for
    # the default level=1 (plt.subplots(1, 1) otherwise returns a bare Axes
    # object, and .ravel() raised AttributeError).
    fig, axs = plt.subplots(1, level, figsize=(15, 5), squeeze=False)
    for i, ax in enumerate(axs.ravel()):
        level_factor = 2 ** (i+1)  # 2x2, 4x4, 8x8, ... cells
        ax.imshow(image)
        ax.axis('off')
        cell_height = int(image.shape[0] / level_factor)
        cell_width = int(image.shape[1] / level_factor)
        for row in range(level_factor):
            shift_height = row * cell_height
            for col in range(level_factor):
                shift_width = col * cell_width
                # Shrink each cell by `linewidth` so adjacent borders do not overlap.
                rect = patches.Rectangle((shift_width, shift_height),
                                         cell_width - linewidth,
                                         cell_height - linewidth,
                                         linewidth=linewidth,
                                         edgecolor=color, facecolor='none')
                ax.add_patch(rect)
    plt.show()
# Visualize 3 levels of grid splitting on a sample training image.
image = train_images_filenames[9]
img = cv2.imread(image)
plot_image_divisions(img, level=3)
In addition, we also split images into horizontal sub-regions. This makes sense as most images in the dataset come from horizontal landscapes.
Here's a breakdown of the process:
This approach takes into account the specific characteristics of the dataset, which is composed of landscape images, and thus it's better suited for analyzing and classifying them. By dividing the image into horizontal sub-regions, we can extract more information from the image, and get more accurate results.
def plot_horizontal_divisions(image, level=1, color = '#fff', linewidth = 2):
    """Show *level* copies of *image* side by side, overlaying 3*(i+1)
    horizontal stripes on the i-th copy to visualize the horizontal
    spatial-pyramid splitting scheme.

    Parameters
    ----------
    image : image array (H x W x C), e.g. as returned by cv2.imread.
    level : number of pyramid levels (one subplot per level).
    color : edge color of the stripe rectangles.
    linewidth : line width of the stripe rectangles.
    """
    # BUGFIX: squeeze=False keeps axs a 2D array so .ravel() also works for
    # the default level=1 (plt.subplots(1, 1) otherwise returns a bare Axes
    # object, and .ravel() raised AttributeError).
    fig, axs = plt.subplots(1, level, figsize=(15, 5), squeeze=False)
    for i, ax in enumerate(axs.ravel()):
        level_factor = 3 * (i + 1)  # 3, 6, 9, ... stripes
        ax.imshow(image)
        ax.axis('off')
        cell_height = int(image.shape[0] / level_factor)
        for row in range(level_factor):
            shift_height = row * cell_height
            rect = patches.Rectangle((0, shift_height),
                                     image.shape[1] - 1,
                                     cell_height + linewidth,
                                     linewidth=linewidth,
                                     edgecolor=color, facecolor='none')
            ax.add_patch(rect)
    plt.show()
# visualize horizontal splitting (the function only plots and has no
# return statement, so img_blocks is always None)
img_blocks = plot_horizontal_divisions(img, level=3)
In this section, we will be exploring the effect of varying the number of pyramid levels and pyramid type. Specifically, we will be using the spatial pyramid technique and comparing the results obtained using a square grid-like splitting versus a horizontal splitting. We will be varying the number of levels in the pyramid and analyzing the effect on the accuracy.
# Plot the persisted Optuna study over pyramid type/levels, if present.
if os.path.exists('./_optuna_study_square.pkl'):
    study = pickle.load(open('_optuna_study_square.pkl','rb'))
    optuna.visualization.plot_slice(study, params=["pyramid_type", "pyramid_levels"], target_name='Accuracy')
We can observe that the horizontal splitting yields better results than the grid-like splitting. This is expected, as the horizontal splitting is more suited for the dataset at hand. It is logical, as the images are landscapes and the horizontal splitting may better capture the horizontal structure of the image. We can also observe that the accuracy increases as the number of pyramid levels increases, as the representation of the image becomes more detailed. However, the increase is not significant, and the computational cost increases significantly. Thus, we can conclude that 2 pyramid levels is a good compromise between accuracy and computational cost.
To reduce computational time, we can rely on PCA to reduce the feature space dimensionality, by projecting it onto a lower dimensional space. Last week we extensively studied the performance of our system in combination with PCA and LDA. This week, we explore PCA once more, as it can be useful when using spatial pyramids, where the feature vector is much bigger.
Since the dimension of the original space from which the projected vectors originate also affects the optimal projection space, we explore both the codebook size and the PCA dimension.
# Grid search over codebook size x PCA dimensionality; mean CV accuracy and
# fit time are cached as pickled matrices under ../vis/.
codebook_size = [1, 2, 16, 64, 128, 224, 512, 1024]
pca_dims = [1, 2, 16, 64, 128, 224, 512, 1024]
if os.path.exists('../vis/hm_matrix.pkl') and os.path.exists('../vis/time_matrix.pkl') :
    matrix = pickle.load(open('../vis/hm_matrix.pkl', 'rb'))
    time_matrix = pickle.load(open('../vis/time_matrix.pkl', 'rb'))
else:
    warnings.filterwarnings("ignore")
    matrix = np.zeros((len(codebook_size), len(pca_dims)))
    time_matrix = np.zeros((len(codebook_size), len(pca_dims)))
    for n, csize in enumerate(codebook_size):
        for m, pdim in enumerate(pca_dims):
            # PCA cannot produce more components than input dimensions;
            # those cells stay zero and are masked in the heatmap below.
            if pdim > csize: continue
            clf = ClassifierBoVW(
                descriptor='SIFT',
                dense=True, # NOTE(review): original comment here was garbled; presumably "dense does not work with KAZE descriptor" — verify
                n_keypoints=1000,
                codebook_size=csize,
                distance_metric='l2',
                dim_reducer_type='PCA', # 'PCA' or 'LDA'
                pca_perc = pdim
            )
            res = run_model_with_params(clf, 'codebook_size', [csize], train_dataset, verbose = True)
            matrix[n, m] = res[0][1]['test_score'].mean()
            time_matrix[n, m] = res[0][1]['fit_time'].mean()
    pickle.dump(matrix, open('../vis/hm_matrix.pkl', 'wb'))
    pickle.dump(time_matrix, open('../vis/time_matrix.pkl', 'wb'))
Applying PCA does not increase the performance in any sense; therefore, using PCA is only convenient for achieving better training and inference times. PCA-reduced components never show more significance than the original set; this points to the intuition that there are no useless features in our histograms, i.e. there are no visual words giving uncorrelated or redundant information.
In the following figure, we can observe how PCA's performance is capped by the model performance, meaning that PCA won't outperform it.
# Heatmaps: accuracy (left) and fit time (right) vs. PCA dims x codebook size
cmap = sns.cubehelix_palette(start=1.6, light=0.8, as_cmap=True, reverse=True)
fig, axs = plt.subplots(ncols = 2, figsize = (16, 6) )
# Mask zero cells: combinations where PCA dims > codebook size were skipped.
ax = sns.heatmap(matrix, annot=True, cmap = cmap, mask = matrix==0, ax = axs[0])
# Times are divided by 10, i.e. displayed in tens of seconds.
ax2 = sns.heatmap(time_matrix / 10, annot=True, cmap = cmap, mask = matrix==0, ax = axs[1])
axs[0].set_ylabel('Size of the codebook')
axs[0].set_xlabel("PCA Reduction")
axs[1].set_xlabel("PCA Reduction")
ax.set_yticklabels(codebook_size)
ax.set_xticklabels(pca_dims)
ax2.set_yticklabels(codebook_size)
ax2.set_xticklabels(pca_dims)
axs[0].set_title("PCA/#VWords Performance (acc [%])")
axs[1].set_title("PCA/#VWords Performance (time [10s])")
plt.plot()
[]
Performance in terms of accuracy (left, %) and time (right, deca-seconds) with respect to the size of the codebook (vertical axis) and the PCA number of components (horizontal). Note that the region where $PCA_{Components} > |VisualWords|$ is not defined since PCA cannot generate more dimensions than the current ones.
So far, we've been training and testing our model with a KNN-based classifier. Nevertheless, other alternatives are possible. In this section, we will explore other options such as using an SVM approach. This change shall give the model greater capabilities in terms of hyperspace separation. However, most of the time data is not linearly separable, therefore we will show the performance of the model using different kernel functions for the SVM.
Before exploring the effect of the kernel, let us find the most suitable set of parameters for this data with the most basic kernel (linear).
# Linear-kernel SVM baseline for one-at-a-time parameter ablations:
# regularization C, shrinking heuristic, and class weights.
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense=True,
    step_size=10,
    codebook_size=128,
    distance_metric='l2',
    classifier_class=SVC,
    scaler_class=StandardScaler,
    classifier_parameters = {}
)
params_c = [{'C': 10**x, 'kernel': 'linear'} for x in [1, 2, 3, -2, -3, -4]]
params_deegree = [{'shrinking': x, 'kernel': 'linear'} for x in [True, False]]
params_weight = [{'class_weight': x, 'kernel': 'linear'} for x in ['balanced', None]]
if not os.path.exists('./results/svm_no_kernel.csv'):
    # 7-fold CV per setting; raw results are flattened to CSV further below.
    res_c = run_model_with_params(clf, 'classifier_parameters', params_c, train_dataset, verbose = True, n_jobs = 4, cv = 7,)
    res_degree = run_model_with_params(clf, 'classifier_parameters', params_deegree, train_dataset, verbose = True, n_jobs = 4, cv = 7,)
    res_weight = run_model_with_params(clf, 'classifier_parameters', params_weight, train_dataset, verbose = True, n_jobs = 4, cv = 7,)
    clear_output()
if not os.path.exists('./results/svm_no_kernel.csv'):
    ## Save to dataframe
    # One row per CV fold per ablation; parameters not varied stay None.
    columns = ['mean_test_score', 'mean_fit_time', 'C', 'shrinking', 'class_weight']
    default = [None, None, 1, True, None] # For setting when not changing
    rows = []
    ### res-c data
    for result in res_c:
        value = result[0]['C']
        for time, score in zip(result[1]['fit_time'], result[1]['test_score']):
            rows.append([score, time, str(float(value)), None, None])
    for result in res_degree:
        value = result[0]['shrinking']
        for time, score in zip(result[1]['fit_time'], result[1]['test_score']):
            rows.append([score, time, None, 'heuristic' if value else 'no heuristic', None])
    for result in res_weight:
        value = result[0]['class_weight']
        for time, score in zip(result[1]['fit_time'], result[1]['test_score']):
            rows.append([score, time, 1, None, value if isinstance(value, str) else 'no-weights'])
    # Load df
    df = pd.DataFrame(rows, columns=columns)
    # violin_plot looks for this column (a warning is printed when absent).
    df['param_pca_perc'] = [0]*len(df)
    df.to_csv('./results/svm_no_kernel.csv')
else:
    dtypes = {'mean_test_score': float, 'mean_fit_time': float, 'C': str, 'shrinking': str, 'class_weight': str}
    df = pd.read_csv('./results/svm_no_kernel.csv', dtype = dtypes)
#df = df.where(df==df, other=None)
violin_plot(df, ['C', 'shrinking', 'class_weight'], [None, None, None], ['C', 'Heuristic', 'Weights'])
/home/adri/Desktop/master/M3/M3---ML-in-CV/week2/visutils.py:163: UserWarning: FixedFormatter should only be used together with FixedLocator axs[2][n].set_xticklabels(list(valueslut.keys())) /home/adri/Desktop/master/M3/M3---ML-in-CV/week2/visutils.py:163: UserWarning: FixedFormatter should only be used together with FixedLocator axs[2][n].set_xticklabels(list(valueslut.keys()))
From these brief results (7 folds per parameter) we can observe that there isn't any notable difference in accuracy. Nevertheless, there is a clear insight that a big regularization term $C$ leads to slow convergence and slightly worse results. This has been done through an ablation study fixing the rest of the parameters with a linear kernel; therefore, the non-unimodality of some distributions may be caused by the low stability of the method or the low significance of the differences.
As we've shown in the previous result, SVMs seem to be very stable when using the same kernel (linear). In this section we will show the differences in performance between five different kernel functions: linear, polynomial, RBF, sigmoid and histogram intersection. Since we are dealing with histograms of visual words, we expect the intersection kernel to perform reasonably well, and this is indeed the case in our experiments.
# SVC accepts a custom kernel as a callable, so the histogram intersection
# kernel K(x, y) = sum_d min(x_d, y_d) is implemented as a plain function.
def histogramIntersection(data_1, data_2):
    """Gram matrix of the histogram-intersection kernel between two sample sets."""
    n_left, n_right = data_1.shape[0], data_2.shape[0]
    gram = np.zeros((n_left, n_right))
    # Accumulate one feature dimension at a time to keep memory bounded.
    for feature_idx in range(data_1.shape[1]):
        left = data_1[:, feature_idx].reshape(-1, 1)
        right = data_2[:, feature_idx].reshape(1, -1)
        gram += np.minimum(left, right)
    return gram
# Same BoVW setup but without feature scaling, to compare SVM kernels.
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense=True,
    step_size=10,
    codebook_size=128,
    distance_metric='l2',
    classifier_class=SVC,
    classifier_parameters = {},
    scaler_class = None
)
# The last entry passes the custom callable kernel defined above.
params_kernel = [{'kernel': x} for x in ["linear", "poly", "rbf", "sigmoid", histogramIntersection]]
if not os.path.exists('./results/svm_kernel.csv'):
    res_params = run_model_with_params(clf, 'classifier_parameters', params_kernel, train_dataset, verbose = True, n_jobs = 4, cv = 5,)
    clear_output()
if not os.path.exists('./results/svm_kernel.csv'):
    ## Save to dataframe
    columns = ['mean_test_score', 'mean_fit_time', 'kernel',]
    rows = []
    ### one row per CV fold per kernel
    for result in res_params:
        value = result[0]['kernel']
        for time, score in zip(result[1]['fit_time'], result[1]['test_score']):
            rows.append([score, time, str(value)])
    # Load df
    df = pd.DataFrame(rows, columns=columns)
    df['param_pca_perc'] = [0]*len(df)
    df.to_csv('./results/svm_kernel.csv')
else:
    df = pd.read_csv('./results/svm_kernel.csv',)
# violin_plot(df, ['kernel',], [None,], ['kernel', ])
Regarding the linear kernel, we can observe that we're obtaining worse results (recall that in the plot above we show 1 - accuracy).
In our dataset, images of different classes share visual words (e.g. trees in both forests and open country classes), therefore our dataset is not linearly separable and it doesn't make sense to use a linear classifier.
In this case, it is recommended to use the RBF kernel, which creates non-linear combinations of the features to uplift the samples onto a higher-dimensional feature space where a linear decision boundary can be used. And indeed, we obtain the best accuracy with this kernel.
We can certify that the histogram intersection kernel that we implemented can be used for SVM, as we get good results using it. This is because this kernel is useful in our specific problem, as we're using histograms as features. However, it still performs worse than the sigmoid and RBF kernels. For this reason, we'll use the RBF kernel.
Additionally, we provide a brief summary of the results in terms of the classifier used. Further research will be done in a posterior global optimization; nevertheless, we start from this baseline.
## Different models
import warnings
warnings.filterwarnings('ignore')
# Best parameters last week; only the classifier class is varied below.
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense=True,
    step_size=10,
    codebook_size=128,
    distance_metric='l2',
    classifier_class=SVC,
)
print(clf.scaler_class)
models = [SVC, KNeighborsClassifier, LogisticRegression, RandomForestClassifier, GaussianNB]
if os.path.exists('../vis/classifiers.pkl',):
    class_results = pickle.load(open('../vis/classifiers.pkl', 'rb'))
else:
    # BUGFIX: the results were stored in a misspelled variable
    # (`class_resuls`), so `class_results` was undefined (NameError) on a
    # fresh run without the cached pickle; use one name consistently.
    class_results = run_model_with_params(clf, 'classifier_class', models, train_dataset, verbose = True)
    pickle.dump(class_results, open('../vis/classifiers.pkl', 'wb'))
standardscaler
# Summarize per-classifier cross-validation results as a dataframe:
# mean accuracy, accuracy standard deviation (sqrt of variance), mean fit time.
colums = ['Classifier', 'Test Accuracy', 'Accuracy STD', 'Fit Time']
rows = [[name, x['test_score'].mean(), x['test_score'].var() ** .5, x['fit_time'].mean(),] for name, (_, x) in zip(['SVM', 'KNN', 'Logistic', 'Random Forest', 'Naive Bayes'], class_results)]
pd.DataFrame(rows, columns=colums)
| Classifier | Test Accuracy | Accuracy STD | Fit Time | |
|---|---|---|---|---|
| 0 | SVM | 0.738969 | 0.008270 | 16.453848 |
| 1 | KNN | 0.677299 | 0.012100 | 16.475282 |
| 2 | Logistic | 0.689527 | 0.021052 | 16.384468 |
| 3 | Random Forest | 0.717172 | 0.014285 | 16.590544 |
| 4 | Naive Bayes | 0.620415 | 0.027716 | 17.176783 |
Given the low variance of the methods on the cross-validation folds, we assume stable solutions. In this sense, we can observe that all algorithms perform reasonably well; unsurprisingly, SVM outperforms the rest. The random forest ensembling approach performs very close to SVM's results. With a very low significance, KNN and logistic regression are the worst, but as we will observe this is just in this specific case and with these parameters. This low significance will lead this ranking to re-arrange when globally optimizing. In any case, Naive Bayes performs poorly, with high significance, even in comparison to KNN; therefore, for the sake of time performance, it will not be globally optimized.
Given the big amount of parameters to optimize, we will be using Optuna for the parameter search.
First, let us define the target function for the search
Warning! Since optuna by default minimizes the target function, we are optimizing 1 - accuracy (namely accuracy loss)
Due to performance reasons, we will not include the pyramid representation parameter in the global search. It's already demonstrated that the number of pyramids levels and the shape of them benefits the model but, for the sake of time performance, we will not be including it here. Consider that with no spatial pyramid, it already takes around 5 minutes to validate a single model. Note also that in the most naive pyramid representation, the number of histograms scale exponentially with respect to the number of levels $O(2^L)$ and, since optuna is already kind of slow, we cannot handle this much of workload in a reasonable time of execution. For the final results we will include the pyramid representation with the number of levels discussed above.
def target(trial):
    """Optuna objective for the global BoVW hyper-parameter search.

    Builds a ClassifierBoVW from the trial's sampled parameters, fits it on
    the train split and evaluates on the test split. Returns 1 - accuracy
    (accuracy loss), since optuna minimizes the objective by default.
    """
    ### FIRST, GLOBAL PARAMETERS ###
    descriptor_name = trial.suggest_categorical('descriptor', ['SIFT', 'KAZE'])
    # Dense sampling is only meaningful for SIFT.
    if descriptor_name == 'SIFT': dense = trial.suggest_categorical('dense', [True, False])
    else: dense = False
    #num_pyramid = trial.suggest_int('pyramid-levels', 1, 4)
    if dense:
        step_size = trial.suggest_int('step-size', 5, 30)
        n_keypoints = trial.suggest_int('n-keypoints', 10, 1000)
    else:
        step_size = None
        n_keypoints = None
    codebook_size = trial.suggest_int('codebook-size', 10, 512)
    distance_metric = trial.suggest_categorical('distance metric', ['l1', 'l2', 'cosine'])
    dim_reducer_type = trial.suggest_categorical('dim-reducer', ['PCA', 'LDA', None])
    # NOTE(review): the dim-reducer parameters sampled below are never passed
    # to ClassifierBoVW — confirm whether dimensionality reduction was meant
    # to take part in this search.
    if dim_reducer_type is not None:
        if dim_reducer_type == 'PCA':
            pca_perc = trial.suggest_float('pca %', 0, 1)
            n_components = None
        else:
            pca_perc = None
            n_components = trial.suggest_int('lda dims', 1, 7)
    else: n_components, pca_perc = None, None
    norm = trial.suggest_categorical('Normalizers', ['standardscaler', 'l2', 'powernorm', 'none'])
    ### NOW HARD PART, THE MODELS ###
    model_names = {'svm': 0, 'knn': 1, 'logistic': 2, 'forest': 3,}
    available_models = [SVC, KNeighborsClassifier, LogisticRegression, RandomForestClassifier,]
    model = trial.suggest_categorical('classifier', list(model_names.keys()))
    if model == 'svm':
        kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'inter']
        k = trial.suggest_categorical('svm-kernel', kernels)
        # BUG FIX: was `1 ** ...`, which evaluates to 1 for every sampled
        # exponent. The parameter is labelled log(C), so base 10 is intended
        # (matching the random-forest branch below).
        params = {'kernel': histogramIntersection if k=='inter' else k , 'C': 10 ** trial.suggest_int('svm log(C)',-4, 4)}
        if k == 'poly':
            params['degree'] = trial.suggest_int('svm poly degree', 2, 10)
    elif model == 'knn':
        params = {'n_neighbors': trial.suggest_int('knn-neighs', 1, 20),
                  'weights': trial.suggest_categorical('knn weights', ['distance', 'uniform']),
                  'metric':distance_metric}
        params['n_jobs'] = 4
    elif model == 'forest':
        params = {'n_estimators': 10 ** trial.suggest_int('Random Forest log10(N-Estimators)', 1, 3),
                  'criterion': trial.suggest_categorical('RF criterion', ['gini', 'entropy', 'log_loss'])} # I don't think there's a need of going forward LoL
        params['n_jobs'] = 4
    else:
        # BUG FIX: same `1 **` issue as in the SVM branch above.
        params = {'penalty': trial.suggest_categorical('logistic pentalty', ['l2', None, 'l1',]),
                  'C': 10 ** trial.suggest_int('logistic log(C)',-5, -1)}
        params['n_jobs'] = 4
        params['solver'] = 'saga'  # saga supports every sampled penalty, incl. l1
    clf = ClassifierBoVW(
        descriptor=descriptor_name,
        dense=dense,
        pyramid_levels = 1,  # pyramids did not work here either; excluded from this search
        step_size=step_size,
        n_keypoints=n_keypoints,
        codebook_size=codebook_size,
        distance_metric=distance_metric,
        classifier_class=available_models[model_names[model]],
        classifier_parameters = params,
        scaler_class=norm)
    #scores = cross_validate(clf, train_dataset.images, train_dataset.labels, scoring = 'accuracy', cv= 5, n_jobs=6)
    #print(scores)
    #results = run_model_with_params(clf, 'dense', [dense], train_dataset, verbose = True, n_jobs = 6, cv = 5,)[0][1]['test_score']
    clf.fit(train_dataset.images, train_dataset.labels)
    preds = clf.predict(test_dataset.images)
    acc = [x==y for x,y in zip(preds, test_dataset.labels)]
    clear_output()
    return 1 - sum(acc)/len(acc)
# Persist the optuna study so the 224-trial search is not repeated on re-runs.
fname = './optuna_study_no_pyramid.pkl'
if os.path.exists(fname):
    # A finished study exists on disk: reuse it.
    study = pickle.load(open(fname,'rb'))
else:
    study = optuna.create_study(storage="sqlite:///db.sqlite3", study_name="M3 - W2 - No Pyramid") # Create a new study.
    study.optimize(target, n_trials=224) # Invoke optimization of the objective function.
    pickle.dump(study, open(fname, 'wb'))
With optuna, we can observe the individual behaviour of each parameter. First, as optuna searches for the best set of parameters with a sort of crawling/graph-based search, we can visualize the main parameters that did the greatest contribution to the results.
# Hyper-parameter importance: which search dimensions explain most of the
# variation in the objective across trials.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image('./optuna-vis/importance.png')
#fig.show()
We can see that the most important parameters are the classifiers and the number of visual words. The rest of the parameters are not as important as the previous ones, but they still contribute to the results.
Another useful visualization this module provides, is exploring the most promising path to follow when optimizing. Note that this visualization can only be done with complete paths, therefore we will have to select only those parameters common to all the posible combinations, which, on the other hand, are many.
# Parallel-coordinates view restricted to parameters shared by every trial,
# so that complete optimization paths can be drawn.
fig = optuna.visualization.plot_parallel_coordinate(study, params=["codebook-size", "descriptor", "dim-reducer", "distance metric"])
fig.write_image('./optuna-vis/parallel.png')
#fig.show()
It can be readily seen that size matters when it comes to codebook size.
In addition to this week's parameters, we also were interested in revalidating our initial choice for SIFT descriptors. It's easy to see the clear dominance of SIFT with respect KAZE. As we already commented, there's no good reason to use dimensionality reduction other than time performance but, if used, PCA shows a very good connection with manthattan distance (l1). Additionally, for the parameters that cannot be always connected we can set "slices". For instance, let us observe how each classifier behaved.
# Slice plot: objective value per sampled classifier type.
fig = optuna.visualization.plot_slice(study, params = ['classifier'])
fig.write_image('./optuna-vis/slices.png')
#fig.show()
As it was mentioned previously, a global optimization can lead to re-arrangements in the priorities for several parameters. Thus, we observe that the best performance has been obtained by the SVM classifier, followed very closely by a logistic one. Here, we expose a discussion about the best parameters for every classifier. We present a table with the top 5 performing classifiers for each category, sorted by global performance.
# Top-5 optuna trials: every one is an intersection-kernel SVM with C=1,
# differing only in keypoint count and codebook size.
colums = ['Ranking', 'Classifier', 'Test Accuracy', 'C', 'Kernel', 'Keypoints', 'Codebook Size']
top_configs = [(89, 430), (39, 430), (93, 411), (54, 411), (54, 411)]
rows = [[rank, 'SVM', .84, 1, 'intersection', kp, cb]
        for rank, (kp, cb) in enumerate(top_configs, start=1)]
pd.DataFrame(rows, columns=colums, )
| Ranking | Classifier | Test Accuracy | C | Kernel | Keypoints | Codebook Size | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | SVM | 0.84 | 1 | intersection | 89 | 430 |
| 1 | 2 | SVM | 0.84 | 1 | intersection | 39 | 430 |
| 2 | 3 | SVM | 0.84 | 1 | intersection | 93 | 411 |
| 3 | 4 | SVM | 0.84 | 1 | intersection | 54 | 411 |
| 4 | 5 | SVM | 0.84 | 1 | intersection | 54 | 411 |
Interestingly, those results don't show any significant change in performance, but there's a big gap in the keypoints and codebook size. It may point to the fact that SVM doesn't really care about outliers but just about the points collinear to the support vectors.
# Top-5 logistic-regression trials: all use C=5e-5 with l1 (Lasso) penalty.
colums = ['Ranking', 'Classifier', 'Test Accuracy', 'C', 'Regularizer', 'Keypoints', 'Codebook Size']
logistic_top = [
    (12, .83, 344, 466),
    (21, .83, 303, 481),
    (33, .82, 238, 453),
    (37, .82, 261, 462),
    (38, .82, 314, 505),
]
rows = [[rank, 'Logistic', acc, 5e-5, 'Lasso', kp, cb]
        for rank, acc, kp, cb in logistic_top]
pd.DataFrame(rows, columns=colums)
| Ranking | Classifier | Test Accuracy | C | Regularizer | Keypoints | Codebook Size | |
|---|---|---|---|---|---|---|---|
| 0 | 12 | Logistic | 0.83 | 0.00005 | Lasso | 344 | 466 |
| 1 | 21 | Logistic | 0.83 | 0.00005 | Lasso | 303 | 481 |
| 2 | 33 | Logistic | 0.82 | 0.00005 | Lasso | 238 | 453 |
| 3 | 37 | Logistic | 0.82 | 0.00005 | Lasso | 261 | 462 |
| 4 | 38 | Logistic | 0.82 | 0.00005 | Lasso | 314 | 505 |
Although these are competitive results compared to SVM, the required amount of features to be efficiently trained is way bigger. Therefore it's still more convenient to use SVM, as it can work with fewer keypoints.
Some parameters interact in a consistent way with each other; here we can visualize several of them.
# Pairwise contour plots of parameters that interact with each other.
fig = optuna.visualization.plot_contour(study, params = ['n-keypoints', 'codebook-size', 'pca %', 'Normalizers'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.write_image('./optuna-vis/contours.png')
As the SVM Classifier showed the best performance, here we observe a more deep visualization on its parameters.
# Slice plot restricted to the SVM-specific parameters of the study.
fig = optuna.visualization.plot_slice(study, params = ['svm log(C)', 'svm-kernel'])
fig.write_image('./optuna-vis/slice-svm.png')
As expected, the most suitable kernel for this problem is the histogram intersection kernel. This is because the bag of words approach grants us a probabilistic visual representation of the data. Interestingly, the log(C) parameter behaves like a normal distribution centered at 0 (log(C) = 0 implies C = 1), which has been the most convenient margin for the SVM.
In this section we will evaluate the model on the test set. We will use the best parameters found in the previous section. The metric for evaluation is accuracy.
# 3: set the best hyper-parameters found by the optuna search (SVM branch).
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense = True,
    classifier_class=SVC,
    classifier_parameters={'kernel': histogramIntersection, 'C': 1},
    codebook_size=430,
    dim_reducer_type=None,
    distance_metric='l1',
    step_size=5,
    scaler_class=None
)
clf.fit(train_dataset.images, train_dataset.labels)
/home/adri/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
# Evaluate the tuned model on the held-out test set.
predictions = clf.predict(test_dataset.images)
test_accuracy = accuracy_score(test_dataset.labels, predictions)
print("test accuracy: ", test_accuracy)
test accuracy: 0.8141263940520446
Best parameters with Spatial Pyramid
In addition to the previous results, we will also show the results obtained with the spatial pyramid approach. As was said in the previous section, the spatial pyramid approach requires a huge amount of computational power. Therefore, we will only show the results for the best parameters found with optuna in combination with the horizontal Spatial Pyramid with 2 levels (as it's the one that performs better).
# Same tuned configuration, now with a 2-level horizontal spatial pyramid.
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense = True,
    pyramid_type='horizontal',
    pyramid_levels=2,
    classifier_class=SVC,
    classifier_parameters={'kernel': histogramIntersection, 'C': 1},
    codebook_size=430,
    dim_reducer_type=None,
    distance_metric='l1',
    n_keypoints=89,
    step_size=5,
    scaler_class=None)
clf.fit(train_dataset.images, train_dataset.labels)
predictions = clf.predict(test_dataset.images)
test_accuracy = accuracy_score(test_dataset.labels, predictions)
print("test accuracy: ", test_accuracy)
test accuracy: 0.7955390334572491
The results obtained with the test set are considerably higher than the ones obtained last week. The use of different classifiers and the use of optuna to find the best parameters has been very beneficial for the results.
Surprisingly, the accuracy of the Spatial Pyramid approach on the test set is slightly lower than the one obtained with the vanilla BoVW approach. This may be due to the fact that the spatial pyramid approach is more sensitive to the parameters and, therefore, the best parameters found with optuna are not the best for the spatial pyramid approach. It would be interesting to perform a global search also accounting for the spatial pyramid approach and its parameters.
The BoVW has been successful on the dataset at hand. So far, the approach we followed for obtaining the closest word in the codebook relies only on the amount of local descriptors falling into each cluster. Now, notice that we clustered with k-means, which assumes that clusters form a grid of Voronoi cells. This however, does not take into account that clusters may have different shapes along different directions. With Fisher vectors, we not only employ the mean of the local descriptors, but we also take into account higher order statistics, in particular, the covariances. This injects further information about how far is each feature from its corresponding closest codebook entry (as well as with respect to other vocabulary words).
Here we implement Fisher vectors and study how they perform on our dataset. We utilize dense SIFT descriptors with step_size=5.
import pickle as cPickle

def save_pickle(path, data):
    """Serialize `data` to `path` with pickle."""
    with open(path, 'wb') as f:
        cPickle.dump(data, f)

def load_pickle(path):
    """Load and return a pickled object from `path`."""
    with open(path, 'rb') as f:
        return cPickle.load(f)

# Dense SIFT descriptors are expensive to compute, so they are cached on disk
# and recomputed only when either cache file is missing. (The helpers above
# were previously defined inside the branches, making each available only on
# one code path; they are now defined unconditionally.)
if not os.path.exists('./results/train_sift_descriptors_dense.pkl') or not os.path.exists('./results/test_sift_descriptors_dense.pkl'):
    clf = ClassifierBoVW(
        descriptor='SIFT',
        dense=True,
        step_size=5,
        codebook_size=256,
        distance_metric='l2',
        classifier_class=SVC,
        scaler_class=StandardScaler,
        classifier_parameters = {},
    )
    Tr_desc_dense = clf.compute_descriptors(train_dataset.images)
    Test_descr_dense = clf.compute_descriptors(test_dataset.images)
    save_pickle('./results/train_sift_descriptors_dense.pkl', Tr_desc_dense)
    save_pickle('./results/test_sift_descriptors_dense.pkl', Test_descr_dense)
else:
    Tr_desc_dense = load_pickle('./results/train_sift_descriptors_dense.pkl')
    Test_descr_dense = load_pickle('./results/test_sift_descriptors_dense.pkl')
def Fisher_vector(X, GMM):
    """
    Compute the Fisher vector encoding of a set of local descriptors.

    Params:
        X: (N, D) or (D, )
            descriptors
        GMM: instance of sklearn mixture.GMM
            Gaussian mixture model fitted on descriptors like X
    Return
        fisher_vector: array_like, (K + 2 * D * K, )
            Concatenated derivatives w.r.t. the class-posterior weights,
            means and covariances of X
    """
    X = np.atleast_2d(X)
    n_desc = X.shape[0]

    # Soft assignment of every descriptor to each mixture component (N x K).
    posteriors = GMM.predict_proba(X)

    # Sufficient statistics per component, averaged over the descriptors.
    stat0 = posteriors.sum(axis=0)[:, np.newaxis] / n_desc   # zeroth order (K x 1)
    stat1 = posteriors.T @ X / n_desc                        # first order  (K x D)
    stat2 = posteriors.T @ (X ** 2) / n_desc                 # second order (K x D)

    # Gradients of the log-likelihood w.r.t. mixing weights, means, variances.
    grad_weights = stat0.squeeze() - GMM.weights_
    grad_means = stat1 - stat0 * GMM.means_
    grad_vars = (2 * stat1 * GMM.means_
                 - stat2
                 - stat0 * GMM.means_ ** 2
                 + stat0 * GMM.covariances_)

    # Flatten and concatenate all gradients into one vector.
    return np.hstack((grad_weights, grad_means.flatten(), grad_vars.flatten()))
from tqdm import tqdm
from sklearn.mixture import GaussianMixture
def get_fisher_vectors(num_clusters_gmm, num_components_pca, Train_descriptors, Test_descriptors):
    """
    Returns train and test fisher vectors after training a GMM.

    Parameters
    ----------
    num_clusters_gmm:
        The number of mixture components to use in the Gaussian Mixture Model
    num_components_pca:
        Number of components to use when applying PCA on SIFT descriptors.
    Train_descriptors:
        List of SIFT descriptors per train image
    Test_descriptors:
        List of SIFT descriptors per test image

    Returns
    -------
    (train_fisher_vectors, test_fisher_vectors)
        Power- and L2-normalized Fisher vectors (derivatives with respect to
        the mixing weights, means and variances) for all training and
        testing samples.
    """
    # Sample a subset of the train descriptors to fit PCA and the GMM;
    # roughly 1500 descriptors per mixture component are used.
    D = np.vstack(Train_descriptors)
    indices = random.sample(range(0, D.shape[0]), num_clusters_gmm * 1500)
    sample = D[indices, :]
    # Reduce descriptor dimensionality with PCA fitted on the sample.
    pca = PCA(n_components=num_components_pca)
    samplepca = pca.fit_transform(sample)
    # Fit a diagonal-covariance GMM in the PCA-reduced space.
    GMM = GaussianMixture(n_components=num_clusters_gmm, covariance_type='diag')
    GMM.fit(samplepca)

    def encode(descriptors):
        # BUG FIX: the original sliced the first `num_components_pca` raw
        # columns of each descriptor, but the GMM was fitted in PCA space;
        # descriptors must be projected with the same fitted PCA before
        # computing Fisher vectors. (Also removes dead start/stop/length
        # bookkeeping and the unused vstack of the test descriptors.)
        return [Fisher_vector(pca.transform(d), GMM) for d in tqdm(descriptors)]

    Train_descriptors_fisher = encode(Train_descriptors)
    Test_descriptors_fisher = encode(Test_descriptors)

    def normalize_fisher(descriptors):
        # Power normalization (signed square root) followed by per-sample
        # L2 normalization, as is standard for Fisher vectors.
        image_fvs = np.vstack(descriptors)
        image_fvs = np.sign(image_fvs) * np.abs(image_fvs) ** 0.5
        norms = np.sqrt(np.sum(image_fvs ** 2, 1))
        image_fvs /= norms.reshape(-1, 1)
        return image_fvs

    return normalize_fisher(Train_descriptors_fisher), normalize_fisher(Test_descriptors_fisher)
# Get Fisher vectors with num_clusters_gmm = 64 and num_components_pca = 64
# (the previous comment said 32/32, which did not match the call below).
train_fisher, test_fisher = get_fisher_vectors(64, 64, Tr_desc_dense, Test_descr_dense)
# Build a SVM Classifier
from sklearn import svm
classifier = svm.SVC()
100%|██████████| 1881/1881 [00:56<00:00, 33.34it/s] 100%|██████████| 807/807 [00:22<00:00, 35.69it/s]
# Train the SVM on the Fisher vectors and report test accuracy as a percentage.
classifier.fit(train_fisher, train_dataset.labels)
accuracy = 100 * classifier.score(test_fisher, test_dataset.labels)
print(accuracy)
76.82775712515489
We see how with this configuration, a model working with Fisher vectors already obtains a high accuracy of 76%. Nevertheless, this is not higher than previous models. It seems that Fisher vectors are not really that useful in the dataset, as it is rather easy, and there aren't that many classes.
It is important to keep in mind also that this alternative approach is way more computationally expensive. While with the simpler k-Means approach for building the codebook we were able to have as many as 1024 visual words, with Fisher vectors this value is reduced to less than 256. This is because GMM models take into account also second order statistics, not just the mean, and estimating those is more computationally expensive.
First, we study the number of PCA components while keeping the number of GMM clusters fixed to 32. We limit the number of pca components to 128, as this is the dimension of the features we employ for this experiment:
# Plot the PCA-components slice of the cached Fisher-vector optuna study,
# if the study file is available on disk.
if os.path.exists('./optuna_study_fisher_pca_.pkl'):
    study = pickle.load(open('optuna_study_fisher_pca_.pkl','rb'))
    fig = optuna.visualization.plot_slice(study, params=['pca_components'], target_name='')
    fig.write_image('./optuna-vis/slice-pca.png')
We also study the number of GMM clusters. We fix at 32 the number of PCA components. We tried with a number of GMM clusters equal to 256, but found that the algorithm did not converge, as the complexity of the algorithm grows very fast. So we explore a maximum of 128 clusters.
# Plot the GMM-components slice of the cached Fisher-vector optuna study.
if os.path.exists('./optuna_study_fisher_GMM.pkl'):
    study = pickle.load(open('optuna_study_fisher_GMM.pkl','rb'))
    optuna.visualization.plot_slice(study, params=["GMM Components"], target_name='Accuracy')
Both a bigger number of PCA components and a number of GMM components lead to an increase in accuracy.
With 32 PCA components, the performance reaches a plateau. From that point, it seems that increasing the number of PCA components has a slightly negative effect.
In the case of GMM components, the performance does not really improve after 32 components, so the extra computational cost is not worth it. This is in stark contrast with the codebook size from the simpler kMeans approach. While Fisher vectors are much more computationally expensive to compute, they are more expressive, so a much smaller number of these vectors is needed for obtaining similar results.
Let's start by plotting the confusion matrix to see the percentage of correct and wrong predictions for each class in the test set:
Confusion Matrix
For this visualization, we will be using the best parameters found using optuna with the SVM classifier.
# 3: set the best hyper-parameters (SVM branch) and refit for the confusion matrix.
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense = True,
    classifier_class=SVC,
    classifier_parameters={'kernel': histogramIntersection, 'C': 1},
    codebook_size=430,
    dim_reducer_type=None,
    distance_metric='l1',
    step_size=5,
    scaler_class=None)
clf.fit(train_dataset.images, train_dataset.labels)
predictions = clf.predict(test_dataset.images)
test_accuracy = accuracy_score(test_dataset.labels, predictions)
print("test accuracy: ", test_accuracy)
/home/adri/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
test accuracy: 0.8141263940520446
# Prepare data for confusion matrix
# NOTE(review): `set` iteration order varies between runs; sorting makes the
# class-to-index mapping (and thus the plotted axis order) deterministic.
axs_dict = {name: n for n, name in enumerate(sorted(set(test_labels)))}
cat = len(axs_dict)
# Absolute counts: one pass over the (ground truth, prediction) pairs.
matrix = np.zeros((cat, cat))
for gt, pred in zip(test_labels, predictions):
    matrix[axs_dict[gt], axs_dict[pred]] += 1
# Row-normalized percentages. The original recomputed every cell once per
# (gt, pred) pair of samples — an accidental O(n^2) loop; a single vectorized
# pass yields the same rounded values.
row_totals = matrix.sum(axis=1, keepdims=True)
matrixrel = np.round(100 * matrix / row_totals)
# Plot confusion matrix
cmap = sns.cubehelix_palette(start=1.6, light=0.8, as_cmap=True,)
fig, axs = plt.subplots(ncols = 2, figsize = (10, 4))
# Left panel: absolute counts per (ground truth, prediction) cell.
ax = axs[0]
sns.heatmap(matrix, annot=True, cmap = cmap, ax = ax, cbar = False)
ax.set_ylabel('GT')
ax.set_xlabel("Predicted")
ax.set_title("Absolute count")
ax.set_xticks(list(range(len(axs_dict))), rotation = 45) # Rotates X-Axis Ticks by 45-degrees
ax.set_yticks(list(range(len(axs_dict))), rotation = 45) # Rotates X-Axis Ticks by 45-degrees
ax.set_yticklabels(axs_dict, rotation = 20)
ax.set_xticklabels(axs_dict, rotation = 45)
# Right panel: row-relative percentages, same class order as the left panel.
sns.heatmap(matrixrel, annot=True, cmap = cmap, ax = axs[1], cbar = False, )
ax = axs[1]
ax.set_title("Relative count (%)")
ax.set_yticklabels([], rotation = 0)
ax.set_xticklabels(axs_dict, rotation = 45)
ax.set_xlabel("Predicted")
fig.suptitle('Confusion matrix for test set predictions')
plt.plot()
[]
Here we observe some reasonable errors such as confusion with coast/opencountry; and some less common such as confusing tallbuildings with open countries. You wouldn't expect many buildings in an open country.
Later on, we will observe this behaviours qualitatevely.
ROC Curve
For this visualization, we will be using the logistic approach instead of the SVM. As it's been shown in the global search section, both algorithms lead to competitive results and therefore are comparable.
The reason for doing so is that SVM (as is) cannot retrieve class probabilities and therefore is hard to stablish a monotonic function with respect to Positive rate.
# 3: set the best hyper-parameters for the logistic-regression branch.
# Logistic regression is used here (instead of SVM) because it exposes
# predict_proba, which the ROC analysis below requires.
clf = ClassifierBoVW(
    descriptor='SIFT',
    dense = True,
    classifier_class=LogisticRegression,
    classifier_parameters={'penalty': 'l1', 'solver': 'saga'},
    codebook_size=466,
    dim_reducer_type=None,
    distance_metric='l1',
    step_size=5,
    scaler_class=None)
clf.fit(train_dataset.images, train_dataset.labels)
res = [x==y for x,y in zip(clf.predict(test_dataset.images), test_dataset.labels)]
print(sum(res)/len(res))
/home/adri/anaconda3/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 3 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( /home/adri/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
0.80545229244114
# Recover per-class probabilities on the test set by running the BoVW
# pipeline stages manually up to the fitted classifier.
classes = clf.classifier_model.classes_.tolist()
descriptors = clf.compute_descriptors(test_dataset.images)
descriptors = clf.normalize_descriptors(descriptors)
X_visual_words = clf.extract_visual_words(descriptors)
y_score = clf.classifier_model.predict_proba(X_visual_words)
predicted_labels = clf.classifier_model.predict(X_visual_words)
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay
# One-hot encode the test labels for one-vs-rest ROC analysis.
label_binarizer = LabelBinarizer().fit(train_dataset.labels)
y_onehot_test = label_binarizer.transform(test_dataset.labels)
y_onehot_test.shape # (n_samples, n_classes)
class_of_interest = "coast"
n_classes = 8  # number of scene categories in this dataset split
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
y_test = test_dataset.labels
# NOTE(review): set() ordering is non-deterministic, so target_names may not
# align with label_binarizer.classes_ — confirm before trusting curve names.
target_names = list(set(y_test))
from sklearn.metrics import roc_auc_score
# Micro-averaged one-vs-rest ROC AUC pooled over all classes.
micro_roc_auc_ovr = roc_auc_score(
    test_dataset.labels,
    y_score,
    multi_class="ovr",
    average="micro",
)
from sklearn.metrics import roc_curve, auc
# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()
# Compute micro-average ROC curve and ROC area by pooling all class scores.
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Per-class ROC curves (uses n_classes rather than the hard-coded 8, so the
# loop stays consistent with the macro-average loop below).
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr_grid = np.linspace(0.0, 1.0, 1000)
# Interpolate all ROC curves at these points
mean_tpr = np.zeros_like(fpr_grid)
for i in range(n_classes):
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation
# Average it and compute AUC
mean_tpr /= n_classes
fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
macro_roc_auc_ovr = roc_auc_score(
    y_test,
    y_score,
    multi_class="ovr",
    average="macro",
)
from itertools import cycle
# One ROC curve per class, plus the chance-level diagonal for reference.
fig, ax = plt.subplots(figsize=(18, 18))
colors = cycle(["aqua", "darkorange", "cornflowerblue"])
for class_id, color in zip(range(8), colors):
    # NOTE(review): `color` is never passed to from_predictions, so the cycled
    # palette has no effect — matplotlib assigns its default colors instead.
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_score[:, class_id],
        name=f"ROC curve for {target_names[class_id]}",
        ax=ax,
    )
plt.plot([0, 1], [0, 1], "k--", label="ROC curve for chance level (AUC = 0.5)")
fig.set_size_inches(18.5, 10.5, forward=False)
plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for classes with Area Under the Curve (AOC)")
plt.legend()
plt.show()
Later on we will perform qualitative evaluation, but for now we observe from the roc curves and confusion matrix that the most problematic class is "Opencountry".
Here, we show some examples of common mistakes done by the model.
Wrong Predictions
from visutils import plot_image_samples
# Show a sample of misclassified test images.
plot_image_samples(predictions, test_dataset, figsize=(10, 10), filter_incorrect=True, n_samples=20)
Now we can also observe those more problematic
from visutils import plot_image_samples
# Inspect samples for the most problematic class, 'Opencountry'.
plot_image_samples(predictions, test_dataset, figsize=(10, 10), filter_class = 'Opencountry', n_samples=20)
As we observe, some mistakes are reasonable; actually, on some images misclassified as "coast", the model clearly takes the blue region as if it were the sea, and the ground as if it were sand. Although this category was conflictive because of the coast misclassification, the behaviour is reasonable, and errors correspond to harder samples.
# Inspect samples for the 'tallbuilding' class specifically.
plot_image_samples(predictions, test_dataset, figsize=(10, 10),filter_class='tallbuilding', n_samples=20)
Correct Predictions
from visutils import plot_image_samples
# Show a sample of correctly classified test images for comparison.
plot_image_samples(predictions, test_dataset, figsize=(10, 10), filter_correct=True, n_samples=20)
This week we are starting to approach a certain threshold of accuracy where it's even hard for humans to distinguish some of the labelled categories; some classes such as city/inside-street/tallbuilding are diffuse, and those related to nature such as mountain and forest are common mistakes.
The gap between human and machine performance is stretching for this problem and, therefore, any of the improvements of the model reflects in a very little scale in accuracy.
We showed up different performances on different parameters and we've to remark that there's no absolute winner, some methods can perform much better if we unconstraint the time consumption, but it becomes unfeasible to study and therefore not optimizable.
SVM and Logistic regressor were the best performers, this is probably because they are separation methods instead of agglomerative (such as KNN from last week). Separation methods are usually better suited for spaces where classes may be mixed up since these outliers outside the proper class region will just have no effect on the correct points. An outlier in KNN can misclassify a sample that was positioned in the proper acceptance zone.
In this matter, SVM performs even better because it's capable of totally removing the effect of the outliers (in contrast to Logistic, which despite tolerating outliers in inference time, it takes them into account in training).
One last remark: the trade-off between computational cost and accuracy should always be taken into consideration when optimizing the model. The results obtained in this study suggest that SVM and Logistic Regressor are good candidates for this image recognition task, but in the next weeks, we'll explore more complex models such as MLPs and CNNs.